library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(rpart)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
vaccine<-read.csv("COVID-19_Vaccinations_in_the_United_States_Jurisdiction.csv")
case_death<-read.csv("United_States_COVID-19_Cases_and_Deaths_by_State_over_Time.csv")
USstate=c("AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI","ID","IL","IN","IA","KS",
"KY","LA","ME","MD","MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY",
"NC","ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VT","VA","WA","WV",
"WI","WY")
case_death<-case_death %>%
transmute(date=mdy(submission_date),
state=state,
tot_case=tot_cases,
tot_death=tot_death,
new_case=new_case,
new_death=new_death
) %>%
filter(state %in% USstate) %>%
filter(date >= as.Date("2021-01-01") & date<=as.Date("2021-11-30"))
vaccine<-vaccine %>%
dplyr::select(c(1,3,13,14,15,16,17,18,19,26,28,30,32,34,36,38,40,42,43,44,46,47,48,50,51,52,54,55,56,58,60,62,64,66,67,68)) %>%
rename(state=Location) %>%
mutate(date=mdy(Date)) %>%
dplyr::select(-Date)
Combine two tables into #comb# #new_comb_ma# contains only data in MA
comb<-right_join(vaccine,case_death,by=c("date","state"))
#write.csv(comb,"bigtable.csv",row.names = FALSE)
new_comb_ma<-comb%>%
filter(state=="MA")%>%
arrange(date)
#mutate(new_case=tot_case-lag(tot_case)) %>%
#mutate(new_death=tot_death-lag(tot_death))%>%
#mutate(new_adm=Administered-lag(Administered)) %>%
#mutate(new_jj=Administered_Janssen-lag(Administered_Janssen)) %>%
#mutate(new_moderna=Administered_Moderna-lag(Administered_Moderna))%>%
#mutate(new_pfizer=Administered_Pfizer-lag(Administered_Pfizer))
The number of new cases against date in MA
new_comb_ma%>%ggplot(aes(x=date))+
geom_point(aes(y=new_case))+
geom_vline(xintercept=as.Date("2021-07-03"),color="red")+
geom_text(aes(x=as.Date("2021-04-30"),y=6700,label="Delta variant became dominant"))
The number of new death against date in MA
new_comb_ma%>%ggplot(aes(date,new_death))+
geom_point()+
geom_vline(xintercept=as.Date("2021-07-03"),color="red")+
geom_text(aes(x=as.Date("2021-08-30"),y=70,label="Delta variant became dominant"))
The change of new cases across all states
comb%>%
ggplot(aes(date,new_case,group=state))+
geom_line(color='darkseagreen3',alpha=0.6)+
geom_vline(xintercept=as.Date("2021-07-03"),color="red")+
geom_text(aes(x=as.Date("2021-04-30"),y=40000,label="Delta variant became dominant"))
The change of new deaths across all states
comb%>%
ggplot(aes(date,new_death,group=state))+
geom_line(color='darkseagreen3',alpha=0.6)+
geom_vline(xintercept=as.Date("2021-07-03"),color="red")+
geom_text(aes(x=as.Date("2021-08-30"),y=700,label="Delta variant became dominant"))
# Distinguish states and represent new case using color
library(RColorBrewer)
comb %>% ggplot(aes(date, state, fill = new_case)) +
geom_tile(color = "grey50") +
scale_x_continuous(expand = c(0,0)) +
scale_fill_gradientn(colors = brewer.pal(9, "Oranges"), trans = "sqrt") +
#geom_text(aes(x=as.Date("2021-08-30"), y=700, label="Delta variant became dominant")) +
theme_minimal() + theme(panel.grid = element_blank()) +
xlab("") + ylab("")
## Warning in self$trans$transform(x): 产生了NaNs
## Warning: Transformation introduced infinite values in discrete y-axis
Now, we are going to do some visualizations using maps.
# Pull out us state map data frame
us_states <- map_data("state")
us_states %>% ggplot(aes(x = long, y = lat, fill = region, group = group)) +
geom_polygon(color = "white", fill = "grey") +
coord_fixed(1.3) +
guides(fill = FALSE)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
Before joining the map data with the CDC data, we need to make sure that the states match successfully with each other whenever possible.
x <- case_death$state
case_death <- case_death %>% mutate(region = tolower(state.name[match(x, state.abb)]))
y <- vaccine$state
vaccine <- vaccine %>% mutate(region = tolower(state.name[match(y, state.abb)])) %>%
mutate()
Then we can combine the two dataset using join functions.
# Choose two dates (one before delta dominant, one after)
dates <- c(ymd("2021-06-01", "2021-09-01"))
# Filter case_death data for the two dates and join with us_states data frame.
comb_cd <- case_death %>% filter(date %in% dates)
comb_cd <- left_join(comb_cd, us_states, by = "region")
# Filter vaccine data for the two dates and join with us_states data frame.
comb_vac <- vaccine %>% filter(date %in% dates)
comb_vac <- left_join(comb_vac, us_states, by = "region")
First we explore the map of new cases and deaths.
# Heatmap of new cases on June 1, 2021 and Sept. 1, 2021.
comb_cd %>% ggplot(aes(x = long, y = lat, group = group, fill = new_case)) +
geom_polygon(color = "white") +
coord_fixed(1.3) +
facet_grid(date ~ .) +
ggtitle("Map of new cases") +
scale_fill_viridis_c(name = "New cases", option = "inferno", direction = -1, trans = "sqrt")
# Heatmap of new deaths on June 1, 2021 and Sept. 1, 2021.
comb_cd %>% ggplot(aes(x = long, y = lat, group = group, fill = new_death)) +
geom_polygon(color = "white") +
coord_fixed(1.3) +
facet_grid(date ~ .) +
ggtitle("Map of new deaths") +
scale_fill_viridis_c(name = "New deaths", option = "rocket", direction = -1, trans = "sqrt")
## Warning in self$trans$transform(x): 产生了NaNs
## Warning: Transformation introduced infinite values in discrete y-axis
Then we explore the map of vaccination.
# Filter case_death data on Sept. 1, 2021 and join with us_states data frame.
comb_cd <- case_death %>% filter(date == "2021-09-01")
comb_cd <- left_join(comb_cd, us_states, by = "region")
# Filter vaccine data for the two dates and join with us_states data frame.
comb_vac <- vaccine %>% filter(date == "2021-09-01")
comb_vac <- left_join(comb_vac, us_states, by = "region")
# Heatmap of vaccine (regardless of brands) on June 1, 2021 and Sept. 1, 2021.
comb_vac %>% ggplot(aes(x = long, y = lat, group = group, fill = Administered)) +
geom_polygon(color = "white") +
coord_fixed(1.3) +
ggtitle("Map of total vaccines for each state") +
scale_fill_viridis_c(name = "Vaccination", option = "mako", direction = -1, trans = "log")
See the difference between vaccine brands.
# Heatmap of vaccine (Moderna) on Sept. 1, 2021.
p1 <- comb_vac %>% ggplot(aes(x = long, y = lat, group = group, fill = Administered_Moderna)) +
geom_polygon(color = "white") +
coord_fixed(1.3) +
ggtitle("Map of Moderna vaccinations") +
scale_fill_viridis_c(name = "Vaccination", option = "mako", direction = -1, trans = "log")
# Heatmap of vaccine (Pfizer) on Sept. 1, 2021.
p2 <- comb_vac %>% ggplot(aes(x = long, y = lat, group = group, fill = Administered_Pfizer)) +
geom_polygon(color = "white") +
coord_fixed(1.3) +
ggtitle("Map of Pfizer vaccinations") +
scale_fill_viridis_c(name = "Vaccination", option = "mako", direction = -1, trans = "log")
grid.arrange(p1, p2, ncol = 1)
## Warning: Transformation introduced infinite values in discrete y-axis
Classify the number of new cases into three categories (1: new case<=10000 on that day, 2: 10000<new case<=20000 on that day, 3: new case>20000 on that day) Also classify the number of new deaths into three categories (1: new death<=200 on that day, 2: 200<new death<=400 on that day, 3: new death>400 on that day)
clascomb<-comb%>%
mutate(new_case_clas=ifelse(new_case<=10000,1,ifelse(new_case>20000,3,2)))%>%
mutate(new_death_clas=ifelse(new_death<=200,1,ifelse(new_death>400,3,2)))
Randomly divide the table into one train set (70% data) and one test set(30% data)
inTrain <- createDataPartition(y = clascomb$new_case, p = 0.7, times = 1, list = FALSE)
train_set <- slice(clascomb, inTrain)
test_set <- slice(clascomb, -inTrain)
Classification using decision tree model
rpart_fit <- rpart(new_case_clas ~ Administered, data = train_set)
rpart_preds <- predict(rpart_fit, test_set)
rpart_res<-ifelse(rpart_preds<1.5,1,ifelse(rpart_preds>=2.5,3,2))
#rpart_preds
confusionMatrix(data = as.factor(rpart_res), reference = as.factor(test_set$new_case_clas),positive="1")
## Warning in levels(reference) != levels(data): 长的对象长度不是短的对象长度的整倍
## 数
## Warning in confusionMatrix.default(data = as.factor(rpart_res), reference
## = as.factor(test_set$new_case_clas), : Levels are not in the same order for
## reference and data. Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3
## 1 4850 51 26
## 2 61 13 7
## 3 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.971
## 95% CI : (0.966, 0.9755)
## No Information Rate : 0.9806
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1733
##
## Mcnemar's Test P-Value : 2.087e-07
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3
## Sensitivity 0.9876 0.203125 0.000000
## Specificity 0.2062 0.986246 1.000000
## Pos Pred Value 0.9844 0.160494 NaN
## Neg Pred Value 0.2469 0.989649 0.993411
## Prevalence 0.9806 0.012780 0.006589
## Detection Rate 0.9685 0.002596 0.000000
## Detection Prevalence 0.9838 0.016174 0.000000
## Balanced Accuracy 0.5969 0.594685 0.500000
test_set%>%
ggplot()+geom_point(aes(date,rpart_preds),color="red")+geom_point(aes(date,new_case_clas))
Classification using random forest model
rf_fit <- randomForest(new_case_clas ~ Administered, data = train_set)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
rf_preds <- predict(rf_fit, test_set)
rf_res<-ifelse(rf_preds<1.5,1,ifelse(rf_preds>=2.5,3,2))
#rpart_preds
confusionMatrix(data = as.factor(rf_res), reference = as.factor(test_set$new_case_clas),positive="1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3
## 1 4818 53 29
## 2 91 11 4
## 3 2 0 0
##
## Overall Statistics
##
## Accuracy : 0.9643
## 95% CI : (0.9587, 0.9692)
## No Information Rate : 0.9806
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1118
##
## Mcnemar's Test P-Value : 3.53e-08
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3
## Sensitivity 0.9811 0.171875 0.0000000
## Specificity 0.1546 0.980785 0.9995980
## Pos Pred Value 0.9833 0.103774 0.0000000
## Neg Pred Value 0.1389 0.989188 0.9934079
## Prevalence 0.9806 0.012780 0.0065895
## Detection Rate 0.9621 0.002196 0.0000000
## Detection Prevalence 0.9784 0.021166 0.0003994
## Balanced Accuracy 0.5679 0.576330 0.4997990
test_set%>%
ggplot()+geom_point(aes(date,rf_preds),color="red")+geom_point(aes(date,new_case_clas))
Regression using random forest
newcase_rf<-randomForest(new_case~Administered, data=train_set)
newcase_rf_pred<-predict(newcase_rf,test_set)
test_set%>%
ggplot()+geom_point(aes(date,newcase_rf_pred),color="red")+geom_point(aes(date,new_case))
Divide the table into trainset and testset by date (cutoff point: 2021-07-31)
trainset<-clascomb%>%
filter(date <= as.Date("2021-7-31"))
testset<-clascomb%>%
filter(date >as.Date("2021-07-31"))
#trainset
Machine learning using linear regression
linearfit <- lm(new_case ~ Administered, data = trainset)
linear_pred <- predict(linearfit, testset)
data.frame(
RMSE = RMSE(linear_pred, testset$new_case),
R2 = R2(linear_pred, testset$new_case)
)
## RMSE R2
## 1 2959.376 0.3180471
#testset%>%
# ggplot()+geom_line(aes(date,rpart_preds1),color="red")+geom_point(aes(date,new_case))
ggplot(testset, aes(date, new_case) ) +
geom_point() +
stat_smooth(method = lm, formula = y ~ x)
Machine learning using random forest
regr_rf<-randomForest(new_case~Administered, data=trainset)
regr_rf_pred<-predict(newcase_rf,testset)
testset%>%
ggplot()+geom_point(aes(date,regr_rf_pred),color="red")+geom_point(aes(date,new_case))
plot(regr_rf)
sqrt(sum((regr_rf_pred - testset$new_case)^2) / nrow(testset))
## [1] 2083.902